In [1]:
import os
import sys

# Make the project root (one level above `research/`) importable so sibling
# packages such as `utils` / `configs` resolve on plain `import`.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Get the parent directory
if parent_dir not in sys.path:  # guard: avoid duplicate entries when the cell is re-run
    sys.path.append(parent_dir)
current_working_dir = os.getcwd()  # pure-Python equivalent of the `%pwd` magic

print(f"Parent Dir >>> {parent_dir}")
print(f"Current Working Dir >>> {current_working_dir}")

# from configs import cfgs  # Absolute import
Parent Dir >>> C:\Users\maz\dev\Projects_\alzheimer
Current Working Dir >>> C:\Users\maz\dev\Projects_\alzheimer\research
In [2]:
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

Imports¶

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline

# Custom Functions
# NOTE(review): wildcard import pulls in the helpers used throughout this
# notebook (plot_*_grid, *Transformer classes, categorize_columns_by_suffix, ...).
# Consider listing them explicitly so readers can see where each name comes from.
from utils import *

Paths¶

In [4]:
# Create a path object for the dataset directory
# dataset_dir = cfgs["DATASET_DIR"]
dataset_dir = "../dataset/modified"  # normalized: doubled separators were redundant
dataset_path = Path(dataset_dir)
print("Dataset Path")
print("*"*12)
print(f"Dataset: {dataset_path}")
print("\n")

# Find all CSV files inside the directory.
# `glob` order is filesystem-dependent, so sort for a deterministic listing.
files = sorted(dataset_path.glob("*.csv"))

print("Files in Dataset Dir:")
print("*"*21)
for file_path in files:
    print(file_path.name)  # Print only the file name


# Combining multiple paths with the `/` operator
# path_metadata = dataset_path / "MetaData.xlsx"
path_train = dataset_path / "train.csv"
path_test = dataset_path / "test.csv"
path_train_cleaned = dataset_path / "train_v01.csv"

print("\n")
print(f"Train File Path --> {path_train}")
print(f"Train File Path | Cleaner Version --> {path_train_cleaned}")
print(f"Test File Path --> {path_test}")
Dataset Path
************
Dataset: ..\dataset\modified


Files in Dataset Dir:
*********************
test.csv
test_features.csv
test_labels.csv
train.csv
train_features.csv
train_labels.csv
train_without_featEng.csv
train_with_featEng.csv


Train File Path --> ..\dataset\modified\train.csv
Train File Path | Cleaner Version --> ..\dataset\modified\train_v01.csv
Test File Path --> ..\dataset\modified\test.csv

Data Loading¶

In [5]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
In [6]:
# Load the training data; keep a best-effort fallback if the file is missing.
try:
    dfTrain = pd.read_csv(path_train, encoding='utf8')
    display(dfTrain.head(2))
    print(dfTrain.shape)
except FileNotFoundError:
    # Report the path that was actually attempted — the old message hardcoded
    # 'train.csv', which misleads when `path_train` changes.
    print(f"Error: '{path_train}' not found. Please ensure the file is in the correct location.")
    # NOTE(review): downstream cells assume dfTrain is a DataFrame; a None
    # here will make them fail with AttributeError.
    dfTrain = None
UID Year composite_score Age_03 Urban_03 Married_03 Marriages_03 Education_03 Num_Living_Child_03 Migration_03 ... Meet_FnF_12 SocialActivities_12 AttendReligiousServices_12 a16a_12 YrsLivedInUSA_12 a22_12 a33b_12 SpeaksEnglish_12 HousingEnvironment_12 PredictionYear
0 aard 2021 104 50-59 Urban Widowed 1.0 7-9 Years 1 or 2 0.0 ... Once a week Never 1.0 NaN NaN NaN NaN 0.0 Concrete 9
1 abme 2021 106 50-59 Rural Married or In Civil Union 1.0 1-5 Years 5 or 6 0.0 ... Never Never 0.0 NaN NaN NaN NaN 0.0 Concrete 9

2 rows × 185 columns

(2889, 185)

Shape¶

In [7]:
# Report the training-set dimensions (rows x columns)
n_rows, n_cols = dfTrain.shape
print(f"In the training data we have {n_rows} rows", f"{n_cols} columns")
In the training data we have 2889 rows 185 columns

Examine data¶

In [8]:
#dfTrain.info(verbose=True, show_counts=True)

Features Data Type Conversion¶

  • We will convert Year data type to category for memory efficiency
  • We will convert columns with object type to category if those have low cardinality
  • Also, there are some columns whose inferred data type is float64 but which are actually boolean. We will convert them to Categorical, since Boolean variables are categorical by nature (True, False, NA)

Feature Type

- Convert to Category¶
In [9]:
# Snapshot of pre-conversion dtypes: per-column Series first, then the
# distinct set of dtypes present in the frame.
all_dtypes = dfTrain.dtypes
print("\nAll dtypes (Series):")
print(all_dtypes)
print("-" * 30)

unique_dtypes = dfTrain.dtypes.unique()
print("\nUnique dtypes (NumPy array of dtype objects):")
print(unique_dtypes)
print("-" * 30)
All dtypes (Series):
UID                       object
Year                       int64
composite_score            int64
Age_03                    object
Urban_03                  object
                          ...   
a22_12                    object
a33b_12                   object
SpeaksEnglish_12         float64
HousingEnvironment_12     object
PredictionYear             int64
Length: 185, dtype: object
------------------------------

Unique dtypes (NumPy array of dtype objects):
[dtype('O') dtype('int64') dtype('float64')]
------------------------------
In [10]:
# NOTE(review): this entire cell is a superseded manual version of the dtype
# conversion now handled by the pipeline in the next cell; consider deleting it.
# print("*" * 44)
# print(f"Converting `object` data types to `Category`")
# print("*" * 44)

# # Convert to 'Year ' Column to category
# df['Year'] = df['Year'].astype('category')

# # Convert `Object` data type to Category. Since reading from .csv pandas don't infer them automatically as `Category`
# df, converted_columns_train = identify_and_convert_object_to_category(df, threshold_ratio=0.1, max_unique=50)

# print("\n--- After Conversion ---")
# print("Examine Converted DataFrame dtypes:")
# print(df.dtypes)
# print("\nColumns converted to 'category':", converted_columns_train, "\n")

# # Check the categories in a converted column
# if 'Age_Group' in converted_columns_train:
#     print(f"\nCategories in 'Age_Group' column: {df['Age_Group'].cat.categories.tolist()}")
# if 'Urban_Status' in converted_columns_train:
#     print(f"Categories in 'Urban_Status' column: {df['Urban_Status'].cat.categories.tolist()}")

# print("*" * 65)
# print(f"Converting `Float` data types that have `0` and `1` to `Category`")
# print("*" * 65)
# # Convert Boolean to Category
# df, cat_cols = convert_float_to_bool(df)
# df, bool_cols = convert_boolean_to_category(df)
In [11]:
# --- Define the Pipeline ---
# Note: The order matters if transformations depend on previous ones,
# though in this case, they mostly operate on distinct initial dtypes.

print("*" * 80)
# Assemble the dtype-conversion steps. Ordering is mostly free here since
# each transformer targets a distinct set of initial dtypes.
_conversion_steps = [
    ('specific_categorizer', SpecificColumnCategorizer(columns_to_categorize=['Year'])),
    ('object_to_category', ObjectToCategoryTransformer(threshold_ratio=0.1, max_unique=50)),
    ('float_to_category', FloatToCategoryTransformer()),
    # ('bool_to_category', BooleanToCategoryTransformer())
]
data_type_conversion_pipeline = Pipeline(steps=_conversion_steps)
print(f"PipeLine | Data Types Conversion: {data_type_conversion_pipeline}")
print("*" * 80)
********************************************************************************
PipeLine | Data Types Conversion: Pipeline(steps=[('specific_categorizer',
                 SpecificColumnCategorizer(columns_to_categorize=['Year'])),
                ('object_to_category', ObjectToCategoryTransformer()),
                ('float_to_category', FloatToCategoryTransformer())])
********************************************************************************
In [12]:
%%capture
# --- Apply the Pipeline ---
# NOTE(review): %%capture suppresses ALL output of this cell, so the banner
# prints below are never shown; drop either the magic or the prints.
print("*" * 49)
print("--- Applying Pipeline | Data Types Conversion ---")
print("*" * 49)

# Fit and apply the dtype-conversion pipeline. Result goes into a new frame
# `df`, leaving `dfTrain` as the raw, untransformed copy.
df = data_type_conversion_pipeline.fit_transform(dfTrain)
In [13]:
# Plain string: no placeholders, so the f-prefix was unnecessary.
print("--- PipeLine Completed ---")
--- PipeLine Completed ---
In [14]:
def _framed(title: str, fill: str = "-") -> None:
    """Print `title` framed above and below by a rule of `fill` chars sized to it."""
    rule = fill * len(title)
    print(rule)
    print(title)
    print(rule)

_framed("After Conversion of Data Types", fill="*")

# Per-column dtypes on the converted frame
all_dtypes = df.dtypes
_framed("All dtypes (Series):")
print(f"All Data Types -> {all_dtypes}")

# Distinct dtypes now present (expect mostly `category`)
unique_dtypes = df.dtypes.unique()
_framed("Unique dtypes (NumPy array of dtype objects):")
print(f"Unique Data Types -> {unique_dtypes}")
******************************
After Conversion of Data Types
******************************
--------------------
All dtypes (Series):
--------------------
All Data Types -> UID                        object
Year                     category
composite_score             int64
Age_03                   category
Urban_03                 category
                           ...   
a22_12                   category
a33b_12                  category
SpeaksEnglish_12         category
HousingEnvironment_12    category
PredictionYear              int64
Length: 185, dtype: object
---------------------------------------------
Unique dtypes (NumPy array of dtype objects):
---------------------------------------------
Unique Data Types -> [dtype('O')
 CategoricalDtype(categories=[2016, 2021], ordered=False, categories_dtype=int64)
 dtype('int64')
 CategoricalDtype(categories=['49 or younger', '50-59', '60-69', '70-79', '80+'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Rural', 'Urban'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Married or In Civil Union', 'Separated or Divorced',
                   'Single', 'Widowed'],
 , ordered=False, categories_dtype=object)
 dtype('float64')
 CategoricalDtype(categories=['1-5 Years', '10+ Years', '6 Years', '7-9 Years',
                   'No education'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['1 or 2', '3 or 4', '5 or 6', '7+', 'No children'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=[0.0, 1.0], ordered=False, categories_dtype=float64)
 CategoricalDtype(categories=['Excellent', 'Fair', 'Good', 'Poor', 'Very Good'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Morbidly Obese', 'Normal Weight', 'Obese', 'Over Weight',
                   'Under Weight'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Both', 'Patient', 'Spouse'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Currently Looking for Work', 'Currently Working',
                   'Dedicated to Household Chores',
                   'Retired, Incapacitated, or Does not Work'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['50-59', '60-69', '70-79', '80+'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Agrees', 'Disagrees', 'Neither Agrees nor Disagrees'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Man', 'Woman'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['More than Primary', 'Primary', 'Some Primary'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Administrative Support Staff',
                   'Artisans and Workers in Production, Repair, Maintenance',
                   'Asst/Laborers etc in Ind. Production, Repair, Maintenance',
                   'Department Heads/Coordinators/Supervisors in Admin and Service Activities',
                   'Domestic Workers',
                   'Drivers and Asst Drivers of Mobile Machinery and Transport Vehicles',
                   'Educators', 'Merchants and Sales Representatives',
                   'Officials and Directors Public, Private, and Social Sectors',
                   'Operators of Fixed Machinery and Equipment for Ind. Production',
                   'Other Workers', 'Professionals',
                   'Safety and Security Personnel', 'Technicians',
                   'Traveling Salespeople and Traveling Salespeople of Services',
                   'Workers in Agriculture, Livestock, Forestry, and Fishing',
                   'Workers in Art, Shows, and Sports',
                   'Workers in the Service Industry'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Family', 'Health', 'Laid off', 'Other', 'Retired'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Not Important', 'Somewhat Important', 'Very Important'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Administrative Support Staff',
                   'Artisans and Workers in Production, Repair, Maintenance',
                   'Asst/Laborers etc in Ind. Production, Repair, Maintenance',
                   'Bosses/Supervisors etc in Artistic, Ind. Production, Repair, Maintenance Activities',
                   'Department Heads/Coordinators/Supervisors in Admin and Service Activities',
                   'Domestic Workers',
                   'Drivers and Asst Drivers of Mobile Machinery and Transport Vehicles',
                   'Educators', 'Merchants and Sales Representatives',
                   'Officials and Directors Public, Private, and Social Sectors',
                   'Operators of Fixed Machinery and Equipment for Ind. Production',
                   'Professionals', 'Safety and Security Personnel',
                   'Technicians',
                   'Traveling Salespeople and Traveling Salespeople of Services',
                   'Workers in Agriculture, Livestock, Forestry, and Fishing',
                   'Workers in Art, Shows, and Sports',
                   'Workers in the Service Industry'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['2 or 3 Times a Week', '4 or Moretimes a Week',
                   'Almost Everyday', 'Almost Never, Sporadic',
                   'Every other Week', 'Never', 'Once a Month', 'Once a Week',
                   'Once a week'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['2 or 3 Times a Week', '4 or Moretimes a week',
                   'Almost Everyday', 'Almost Never, Sporadic',
                   'Every other Week', 'Never', 'Once a Month', 'Once a week'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Agriculture/ Animal breeding 01',
                   'Childcare or domestic work 04',
                   'Construction/ Manufacturing/ Mining 02', 'Did not work 08',
                   'Gardening or maintenance 03', 'Other 07',
                   'Restaurant/ Store/ Hotel 05'],
 , ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Citizen', 'Neither', 'Permanent Resident'], ordered=False, categories_dtype=object)
 CategoricalDtype(categories=['Concrete', 'Mud', 'Wood, Mosaic, or other Covering'], ordered=False, categories_dtype=object)]
In [15]:
# Full column-by-column summary: non-null counts reveal per-column missingness.
df.info(verbose=True, show_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 185 columns):
 #    Column                      Non-Null Count  Dtype   
---   ------                      --------------  -----   
 0    UID                         2889 non-null   object  
 1    Year                        2889 non-null   category
 2    composite_score             2889 non-null   int64   
 3    Age_03                      2887 non-null   category
 4    Urban_03                    2889 non-null   category
 5    Married_03                  2889 non-null   category
 6    Marriages_03                2861 non-null   float64 
 7    Education_03                2875 non-null   category
 8    Num_Living_Child_03         2872 non-null   category
 9    Migration_03                2887 non-null   category
 10   GlobalHealth_03             2721 non-null   category
 11   ADL_Dress_03                2722 non-null   category
 12   ADL_Walk_03                 2880 non-null   category
 13   ADL_Bath_03                 2880 non-null   category
 14   ADL_Eat_03                  2878 non-null   category
 15   ADL_Bed_03                  2880 non-null   category
 16   ADL_Toilet_03               2880 non-null   category
 17   Num_ADL_03                  2878 non-null   float64 
 18   IADL_Money_03               2722 non-null   category
 19   IADL_Meds_03                2722 non-null   category
 20   IADL_Shop_03                2722 non-null   category
 21   IADL_Meals_03               2722 non-null   category
 22   Num_IADL_03                 2722 non-null   float64 
 23   Depressed_03                2718 non-null   category
 24   Hard_03                     2721 non-null   category
 25   Restless_03                 2721 non-null   category
 26   Happy_03                    2714 non-null   category
 27   Lonely_03                   2721 non-null   category
 28   Enjoy_03                    2714 non-null   category
 29   Sad_03                      2722 non-null   category
 30   Tired_03                    2722 non-null   category
 31   Energetic_03                2713 non-null   category
 32   Num_CES-D_Symptoms_03       2715 non-null   float64 
 33   CES-D_Symptoms_03           2715 non-null   category
 34   Hypertension_03             2889 non-null   category
 35   Diabetes_03                 2889 non-null   category
 36   Respiratory_Illness_03      2889 non-null   category
 37   Arthritis_03                2889 non-null   category
 38   HeartAttack_03              2887 non-null   category
 39   Stroke_03                   2889 non-null   category
 40   Cancer_03                   2889 non-null   category
 41   Num_Illnesses_03            2887 non-null   float64 
 42   BMI_03                      2041 non-null   category
 43   Exercise_03                 2722 non-null   category
 44   Alcohol_03                  2887 non-null   category
 45   Tobacco_03                  2888 non-null   category
 46   Test_Cholestrol_03          2715 non-null   category
 47   Test_Tuber_03               2708 non-null   category
 48   Test_Diabetes_03            2720 non-null   category
 49   Test_BloodPress_03          2720 non-null   category
 50   Hospitalized_03             2889 non-null   category
 51   Visit_Dr_03                 2885 non-null   category
 52   OutPatient_03               2889 non-null   category
 53   Visit_Dental_03             2889 non-null   category
 54   imss_03                     2889 non-null   category
 55   issste_03                   2889 non-null   category
 56   pem_def_mar_03              2889 non-null   category
 57   insur_private_03            2889 non-null   category
 58   insur_other_03              2889 non-null   category
 59   Insured_03                  2889 non-null   category
 60   FamilyDecisions_03          2046 non-null   category
 61   Employment_03               2886 non-null   category
 62   Age_12                      2782 non-null   category
 63   Urban_12                    2782 non-null   category
 64   Married_12                  2782 non-null   category
 65   Marriages_12                2749 non-null   float64 
 66   Education_12                2768 non-null   category
 67   Num_Living_Child_12         2756 non-null   category
 68   Migration_12                2782 non-null   category
 69   GlobalHealth_12             2682 non-null   category
 70   ADL_Dress_12                2682 non-null   category
 71   ADL_Walk_12                 2773 non-null   category
 72   ADL_Bath_12                 2766 non-null   category
 73   ADL_Eat_12                  2771 non-null   category
 74   ADL_Bed_12                  2772 non-null   category
 75   ADL_Toilet_12               2772 non-null   category
 76   Num_ADL_12                  2763 non-null   float64 
 77   IADL_Money_12               2681 non-null   category
 78   IADL_Meds_12                2681 non-null   category
 79   IADL_Shop_12                2681 non-null   category
 80   IADL_Meals_12               2681 non-null   category
 81   Num_IADL_12                 2680 non-null   float64 
 82   Depressed_12                2676 non-null   category
 83   Hard_12                     2681 non-null   category
 84   Restless_12                 2680 non-null   category
 85   Happy_12                    2670 non-null   category
 86   Lonely_12                   2679 non-null   category
 87   Enjoy_12                    2676 non-null   category
 88   Sad_12                      2677 non-null   category
 89   Tired_12                    2680 non-null   category
 90   Energetic_12                2679 non-null   category
 91   Num_CES-D_Symptoms_12       2660 non-null   float64 
 92   CES-D_Symptoms_12           2660 non-null   category
 93   Hypertension_12             2777 non-null   category
 94   Diabetes_12                 2777 non-null   category
 95   Respiratory_Illness_12      2779 non-null   category
 96   Arthritis_12                2776 non-null   category
 97   HeartAttack_12              2778 non-null   category
 98   Stroke_12                   2780 non-null   category
 99   Cancer_12                   2777 non-null   category
 100  Num_Illnesses_12            2762 non-null   float64 
 101  BMI_12                      2470 non-null   category
 102  Exercise_12                 2682 non-null   category
 103  Alcohol_12                  2782 non-null   category
 104  Tobacco_12                  2782 non-null   category
 105  Test_Cholestrol_12          2672 non-null   category
 106  Test_Tuber_12               2639 non-null   category
 107  Test_Diabetes_12            2678 non-null   category
 108  Test_BloodPress_12          2680 non-null   category
 109  Hospitalized_12             2782 non-null   category
 110  Visit_Dr_12                 2775 non-null   category
 111  OutPatient_12               2780 non-null   category
 112  Visit_Dental_12             2775 non-null   category
 113  imss_12                     2779 non-null   category
 114  issste_12                   2781 non-null   category
 115  pem_def_mar_12              2782 non-null   category
 116  insur_private_12            2780 non-null   category
 117  insur_other_12              2780 non-null   category
 118  Insured_12                  2782 non-null   category
 119  FamilyDecisions_12          1739 non-null   category
 120  Employment_12               2782 non-null   category
 121  Vax_Flu_12                  2673 non-null   category
 122  Vax_Pneu_12                 2612 non-null   category
 123  seg_pop_12                  2782 non-null   category
 124  CareAdult_12                2682 non-null   category
 125  CareChild_12                2680 non-null   category
 126  Volunteer_12                2680 non-null   category
 127  AttendsClass_12             2681 non-null   category
 128  AttendsClub_12              2682 non-null   category
 129  Reads_12                    2672 non-null   category
 130  Games_12                    2680 non-null   category
 131  TableGames_12               2680 non-null   category
 132  UseElectronicDevices_12     2681 non-null   category
 133  HouseMaintenance_12         2682 non-null   category
 134  TV_12                       2682 non-null   category
 135  Sewing_12                   2682 non-null   category
 136  Satement_Ideal_12           2626 non-null   category
 137  Satement_Excel_12           2659 non-null   category
 138  Satement_Fine_12            2676 non-null   category
 139  COSAS_IMP_12                2676 non-null   category
 140  WouldntChange_12            2653 non-null   category
 141  Memory_12                   2658 non-null   category
 142  Gender                      2889 non-null   category
 143  EducationMother             1172 non-null   category
 144  EducationFather             1344 non-null   category
 145  SpouseGender_03             2260 non-null   category
 146  JobHrsWeekly_03             1451 non-null   float64 
 147  JobCatLongest_03            400 non-null    category
 148  YrJobEnded_03               456 non-null    float64 
 149  ReasonJobEnded_03           471 non-null    category
 150  Earnings_03                 2886 non-null   float64 
 151  SpouseEarnings_03           2186 non-null   float64 
 152  hincome_03                  2859 non-null   float64 
 153  hinc_business_03            2888 non-null   float64 
 154  hinc_rent_03                2888 non-null   float64 
 155  hinc_assets_03              2888 non-null   float64 
 156  hinc_cap_03                 2888 non-null   float64 
 157  Pension_03                  2886 non-null   float64 
 158  SpousePension_03            2186 non-null   float64 
 159  Religon_Imp_03              2687 non-null   category
 160  SpouseGender_12             1778 non-null   category
 161  JobHrsWeekly_12             909 non-null    float64 
 162  JobCatLongest_12            1161 non-null   category
 163  YrJobEnded_12               288 non-null    float64 
 164  ReasonJobEnded_12           298 non-null    category
 165  Earnings_12                 2782 non-null   float64 
 166  SpouseEarnings_12           1778 non-null   float64 
 167  hincome_12                  2752 non-null   float64 
 168  hinc_business_12            2782 non-null   float64 
 169  hinc_rent_12                2782 non-null   float64 
 170  hinc_assets_12              2782 non-null   float64 
 171  hinc_cap_12                 2782 non-null   float64 
 172  Pension_12                  2782 non-null   float64 
 173  SpousePension_12            1778 non-null   float64 
 174  Religon_Imp_12              2677 non-null   category
 175  Meet_FnF_12                 2675 non-null   category
 176  SocialActivities_12         2682 non-null   category
 177  AttendReligiousServices_12  2680 non-null   category
 178  a16a_12                     33 non-null     float64 
 179  YrsLivedInUSA_12            53 non-null     float64 
 180  a22_12                      46 non-null     category
 181  a33b_12                     53 non-null     category
 182  SpeaksEnglish_12            2679 non-null   category
 183  HousingEnvironment_12       2800 non-null   category
 184  PredictionYear              2889 non-null   int64   
dtypes: category(148), float64(34), int64(2), object(1)
memory usage: 1.2+ MB

Descriptive statistics for numerical features¶

In [16]:
# Summary statistics (describe() defaults to numeric columns only)
_header = "Descriptive Statistics for Numerical Features:"
print("*" * len(_header))
print(_header)
print("*" * len(_header))
print("\n")

display(df.describe())
**********************************************
Descriptive Statistics for Numerical Features:
**********************************************


composite_score Marriages_03 Num_ADL_03 Num_IADL_03 Num_CES-D_Symptoms_03 Num_Illnesses_03 Marriages_12 Num_ADL_12 Num_IADL_12 Num_CES-D_Symptoms_12 ... hincome_12 hinc_business_12 hinc_rent_12 hinc_assets_12 hinc_cap_12 Pension_12 SpousePension_12 a16a_12 YrsLivedInUSA_12 PredictionYear
count 2889.000000 2861.000000 2878.000000 2722.000000 2715.000000 2887.000000 2749.000000 2763.000000 2680.000000 2660.000000 ... 2.752000e+03 2.782000e+03 2.782000e+03 2782.000000 2.782000e+03 2782.000000 1.778000e+03 33.000000 53.000000 2889.000000
mean 146.141918 1.126879 0.070883 0.045555 3.479190 0.971943 1.197890 0.212450 0.143657 3.366917 ... 8.166788e+04 2.999641e+04 6.470165e+02 833.932423 3.147376e+04 15920.201294 1.513498e+04 1973.848485 7.490566 7.243337
std 59.078730 0.469691 0.402262 0.285861 2.648645 0.956563 0.611733 0.665236 0.478618 2.624120 ... 7.197490e+05 6.912140e+05 2.812897e+04 11193.041142 6.918793e+05 46219.907592 5.433393e+04 18.246783 11.943171 2.387346
min 8.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... -1.900000e+05 0.000000e+00 -2.100000e+05 0.000000 -2.100000e+05 0.000000 0.000000e+00 1942.000000 1.000000 4.000000
25% 105.000000 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000 0.000000e+00 1960.000000 1.000000 4.000000
50% 146.000000 1.000000 0.000000 0.000000 3.000000 1.000000 1.000000 0.000000 0.000000 3.000000 ... 2.000000e+04 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000 0.000000e+00 1970.000000 3.000000 9.000000
75% 186.000000 1.000000 0.000000 0.000000 5.000000 2.000000 1.000000 0.000000 0.000000 5.000000 ... 6.000000e+04 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 20000.000000 0.000000e+00 1987.000000 7.000000 9.000000
max 334.000000 5.000000 5.000000 4.000000 9.000000 5.000000 7.000000 5.000000 4.000000 9.000000 ... 3.602000e+07 3.600000e+07 1.200000e+06 360000.000000 3.600000e+07 960000.000000 1.200000e+06 2012.000000 52.000000 9.000000

8 rows × 36 columns

Examine Categorical Features (for Year 2003 and 2012)¶

In [17]:
# Split columns by survey-wave suffix (`_03`, `_12`, and everything else)
cols = categorize_columns_by_suffix(df)
cols_03 = cols['cols_03']
cols_12 = cols['cols_12']
cols_rest = cols['cols_rest']
# Drop the identifier column from the analysis set; guarded so a schema
# without "UID" (or a variant of this helper) doesn't raise ValueError.
if "UID" in cols_rest:
    cols_rest.remove("UID")
- Categorical features in columns with suffix _03¶
In [18]:
# Get the `_03` suffix columns
cat_cols_03 = df[cols_03].select_dtypes(include=['object', 'category']).columns.to_list()

# # Now Check for Unique Values and Counts
# print("*" * 80)
# print("Unique Values and Counts for Categorical Features in Columns with Suffix '_03'")
# print("*" * 80)
# for col in cat_cols_03:
#     print(f"\nColumn: {col}")
#     print("*" * 30)
#     display(df_cat_03[col].value_counts())

####  Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(df, cat_cols_03, n_cols=3, figsize_per_plot=(10, 5), title_fontsize=18, axis_labelsize=16, tick_fontsize=16)

#### Plot in Plotly #### 
# plot_categorical_distributions_plotly(df, cat_cols_03, width=900, height=450)
--- Plotting Categorical Distributions (Grid Layout) ---
Number of valid columns: 58
No description has been provided for this image
- Categorical features in columns with suffix _12¶
In [19]:
# Categorical (object/category dtype) columns from the 2012 wave (`_12` suffix)
cat_cols_12 = df[cols_12].select_dtypes(include=['object', 'category']).columns.to_list()

#### Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(
    df,
    cat_cols_12,
    n_cols=3,
    figsize_per_plot=(10, 5),
    title_fontsize=18,
    axis_labelsize=16,
    tick_fontsize=16,
)

#### Plot in Plotly ####
# plot_categorical_distributions_plotly(df, cat_cols_12, width=800, height=400)
--- Plotting Categorical Distributions (Grid Layout) ---
Number of valid columns: 86
No description has been provided for this image

- Categorical features in rest of the columns¶

In [20]:
# Categorical columns outside the `_03`/`_12` wave suffixes
cat_cols_rest = df[cols_rest].select_dtypes(include=['object', 'category']).columns.to_list()

#### Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(
    df[cols_rest],
    cat_cols_rest,
    n_cols=2,
    figsize_per_plot=(12, 6),
    title_fontsize=14,
    axis_labelsize=12,
    tick_fontsize=12,
)

#### Plot in Plotly ####
# plot_categorical_distributions_plotly(df[cols_rest], cat_cols_rest, width=900, height=450)
--- Plotting Categorical Distributions (Grid Layout) ---
Number of valid columns: 4
No description has been provided for this image

Examine Numerical Features (For Year 2003 & 2012)¶

- Numerical Features in columns with suffix _03¶
In [21]:
num_cols_03 = df[cols_03].select_dtypes(include=['number']).columns.to_list()
# df[num_cols_03].hist(bins = 30, figsize = (20, 20), color = 'b');

#### Plot Histogram and Violin ####
plot_hist_grid(df, num_cols_03, bins=30, n_cols=4, figsize_per_plot=(5, 4), 
               title_fontsize=14, xlabel_fontsize=11, ylabel_fontsize=11, tick_fontsize=9
              )

plot_violin_grid(df, 
                 numeric_cols=num_cols_03, n_cols=2, figsize_per_plot=(10, 5), 
                 title_fontsize=18, tick_fontsize=14, show_outliers=True
                )
Plotting histograms...Total plots: 16

No description has been provided for this image
Plotting Violin Plots...Number of Plots: 16

No description has been provided for this image
- Numerical Features in columns with suffix _12¶
In [22]:
num_cols_12 = df[cols_12].select_dtypes(include=['number']).columns.to_list()
# df[num_cols_12].hist(bins = 30, figsize = (20, 20), color = 'r');

#### Plot Histogram and Violin ####
plot_hist_grid(df, num_cols_12, bins=30, n_cols=4, figsize_per_plot=(5, 4), 
               title_fontsize=14, xlabel_fontsize=11, ylabel_fontsize=11, tick_fontsize=9
              )

plot_violin_grid(df, 
                 numeric_cols=num_cols_12, n_cols=2, figsize_per_plot=(10, 5), 
                 title_fontsize=18, tick_fontsize=14, show_outliers=True
                )
Plotting histograms...Total plots: 18

No description has been provided for this image
Plotting Violin Plots...Number of Plots: 18

No description has been provided for this image

Numerical features in rest of the columns¶

In [23]:
# Get the Numerical columns besides columns with suffix  `_03` and `_12`
# Numeric columns outside the `_03` / `_12` suffixed groups
# (expected: the target `composite_score` and `PredictionYear`).
num_cols_rest = df[cols_rest].select_dtypes(include=['number']).columns.to_list()
# Fixed typo in message: "Numberical" -> "Numerical".
print(f"Numerical Features in rest of the columns: {num_cols_rest}\n")

# `PredictionYear` holds only two distinct values, so a categorical
# count plot is more informative than a histogram here.
plot_categorical_distributions_seaborn(df, ['PredictionYear'], figsize=(6, 3))
Numberical Features in rest of the columns: ['composite_score', 'PredictionYear']

--- Plotting Categorical Distributions ---
Input columns: ['PredictionYear'], Number of Cols: 1
No description has been provided for this image

Target variable analysis¶

In [24]:
# Target variable: histogram + box plot of `composite_score` (seaborn).
# Interactive Plotly alternatives exist in utils, but static figures
# render on GitHub/nbviewer.
target_col = 'composite_score'
plot_histogram_and_boxplot_sns(df, target_col, figsize=(10, 4))
No description has been provided for this image
Examine Missing values¶
In [25]:
def _banner(text, width):
    # Print `text` framed above and below by an asterisk rule of `width` chars.
    print("*" * width)
    print(text)
    print("*" * width)

_banner("Missing Values:", 23)

# Bar chart of the top-30 columns by missing-value count (seaborn version;
# a Plotly variant is available in utils).
plot_missing_value_distribution_sns(df, top_n=30, figsize=(14, 8))


total_missing = df.isnull().sum().sum()
print(f"Total Missing Values: {total_missing}\n")

# Per-column missing percentage, kept as a one-column DataFrame for display.
missing_percentage = pd.DataFrame(df.isnull().sum() * 100 / len(df), columns=["%age of Missing Values"])

_banner("Percentage of Missing Values per Feature:", 41)
display(missing_percentage)

_banner("Features with More than 40% Missing Values:", 43)
display(missing_percentage[missing_percentage["%age of Missing Values"] > 40])


_banner("Features with less than 40% Missing Values:", 43)
# Last expression: rich display of columns with some (but < 40%) missingness.
missing_percentage[(missing_percentage["%age of Missing Values"] > 0) & (missing_percentage["%age of Missing Values"] < 40)]
***********************
Missing Values:
***********************
No description has been provided for this image
Total Missing Values: 60409

*****************************************
Percentage of Missing Values per Feature:
*****************************************
%age of Missing Values
UID 0.000000
Year 0.000000
composite_score 0.000000
Age_03 0.069228
Urban_03 0.000000
... ...
a22_12 98.407754
a33b_12 98.165455
SpeaksEnglish_12 7.268951
HousingEnvironment_12 3.080651
PredictionYear 0.000000

185 rows × 1 columns

*******************************************
Features with More than 40% Missing Values:
*******************************************
%age of Missing Values
EducationMother 59.432330
EducationFather 53.478712
JobHrsWeekly_03 49.775009
JobCatLongest_03 86.154379
YrJobEnded_03 84.215992
ReasonJobEnded_03 83.696781
JobHrsWeekly_12 68.535826
JobCatLongest_12 59.813084
YrJobEnded_12 90.031153
ReasonJobEnded_12 89.685012
a16a_12 98.857736
YrsLivedInUSA_12 98.165455
a22_12 98.407754
a33b_12 98.165455
*******************************************
Features with less than 40% Missing Values:
*******************************************
Out[25]:
%age of Missing Values
Age_03 0.069228
Marriages_03 0.969193
Education_03 0.484597
Num_Living_Child_03 0.588439
Migration_03 0.069228
... ...
Meet_FnF_12 7.407407
SocialActivities_12 7.165109
AttendReligiousServices_12 7.234337
SpeaksEnglish_12 7.268951
HousingEnvironment_12 3.080651

149 rows × 1 columns

Data Preparation¶

- Dropping Features¶
  • Drop Columns
In [26]:
# # These are redundant features
# COLS_TO_DROP = ['UID', 'imss_03', 'imss_12', 'issste_03', 'issste_12', 'pem_def_mar_03', 'pem_def_mar_12',
#                    'insur_private_03', 'insur_private_12', 'insur_other_03', 'insur_other_12', 'seg_pop_12',
#                    'Tired_03', 'Tired_12', 'Happy_03', 'Happy_12']

# print(f"Redundant Features: {COLS_TO_DROP}")
# print(f"Number of Redundant Features: {len(COLS_TO_DROP)}\n")

# # We will Drop Features which have more than 70% missing values
# na_cols_to_drop = missing_percentage[missing_percentage["%age of Missing Values"] >= 70]
# na_cols_to_drop = na_cols_to_drop.index.to_list()
# print(f"Features with more than 70% missing values: {na_cols_to_drop}")
# print(f"Number of features with more than 70% missing values: {len(na_cols_to_drop)}\n")

# # Adding two columns
# COLS_TO_DROP.extend(na_cols_to_drop)

# print(f"Features to Drop: {COLS_TO_DROP}\n")
# print(f"Number of Features to Drop: {len(COLS_TO_DROP)}\n")

# # Now Dropping Features
# columns_actually_dropped = []
# for col in COLS_TO_DROP:
#     if col in df.columns:
#         df = df.drop(col, axis=1)
#         columns_actually_dropped.append(col)
#     else:
#         print(f"Warning: Column '{col}' not found in DataFrame. Skipping.")

# # Checking 
# check_lists = lambda COLS_TO_DROP, columns_actually_dropped: sorted(COLS_TO_DROP) == sorted(columns_actually_dropped)
# print("*" * 31)
# print("Features Dropped | Successfully" if check_lists(COLS_TO_DROP, columns_actually_dropped) else "Features Dropped | UnSuccessfull")
# print("*" * 31)
In [27]:
# Columns flagged as redundant during EDA (duplicated insurance, mood,
# and identifier fields); order preserved for reproducible logging.
COLS_TO_DROP = [
    'UID',
    'imss_03', 'imss_12',
    'issste_03', 'issste_12',
    'pem_def_mar_03', 'pem_def_mar_12',
    'insur_private_03', 'insur_private_12',
    'insur_other_03', 'insur_other_12',
    'seg_pop_12',
    'Tired_03', 'Tired_12',
    'Happy_03', 'Happy_12',
]

# Single-step sklearn pipeline wrapping the custom ColumnDropper transformer.
dropColumns = Pipeline([
    ('drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP)),
])
wide_rule = "*" * 80
print(wide_rule)
print(f"PipeLine | Drop Columns: {dropColumns}")
print(wide_rule)

# Apply the pipeline; ColumnDropper logs exactly which columns it removed.
narrow_rule = "*" * 40
print(narrow_rule)
print("--- Applying Pipeline | Drop Columns ---")
print(narrow_rule)
df = dropColumns.fit_transform(df)
********************************************************************************
PipeLine | Drop Columns: Pipeline(steps=[('drop_columns',
                 ColumnDropper(columns_to_drop=['UID', 'imss_03', 'imss_12',
                                                'issste_03', 'issste_12',
                                                'pem_def_mar_03',
                                                'pem_def_mar_12',
                                                'insur_private_03',
                                                'insur_private_12',
                                                'insur_other_03',
                                                'insur_other_12', 'seg_pop_12',
                                                'Tired_03', 'Tired_12',
                                                'Happy_03', 'Happy_12']))])
********************************************************************************
****************************************
--- Applying Pipeline | Drop Columns ---
****************************************
ColumnDropper: Dropped columns: ['UID', 'imss_03', 'imss_12', 'issste_03', 'issste_12', 'pem_def_mar_03', 'pem_def_mar_12', 'insur_private_03', 'insur_private_12', 'insur_other_03', 'insur_other_12', 'seg_pop_12', 'Tired_03', 'Tired_12', 'Happy_03', 'Happy_12']
  • Drop Columns with High Missing Values
In [28]:
# Pipeline that drops every column whose missing-value percentage
# is at or above the 70% threshold.
NA_THRESHOLD_PCT = 70.0
dropColumnsHighNA = Pipeline([
    ('drop_columns_high_na', DropColumnsHighNA(threshold=NA_THRESHOLD_PCT)),
])
rule_80 = "*" * 80
print(rule_80)
print(f"PipeLine | Drop Columns With High Missing Values: {dropColumnsHighNA}")
print(rule_80)

# Apply the pipeline; the transformer logs the columns it removed.
rule_65 = "*" * 65
print(rule_65)
print("--- Applying Pipeline | Drop Columns With High Missing Values ---")
print(rule_65)
df = dropColumnsHighNA.fit_transform(df)
********************************************************************************
PipeLine | Drop Columns With High Missing Values: Pipeline(steps=[('drop_columns_high_na', DropColumnsHighNA())])
********************************************************************************
*****************************************************************
--- Applying Pipeline | Drop Columns With High Missing Values ---
*****************************************************************
DropColumnsHighNA: Dropped columns: ['JobCatLongest_03', 'YrJobEnded_03', 'ReasonJobEnded_03', 'YrJobEnded_12', 'ReasonJobEnded_12', 'a16a_12', 'YrsLivedInUSA_12', 'a22_12', 'a33b_12']
  • Impute Missing Values
In [29]:
# Impute using median for numerics, and mode for categoricals
# df_train_cleaned = impute_missing_values(df, num_strategy="median", cat_strategy="mode")
# Use mean for numerics, and fill categorical with a custom label e.g. `missing`
# df_clean = impute_missing_values(df, num_strategy="mean", cat_strategy="missing")
In [30]:
# Impute remaining gaps: median for numeric columns, mode for categoricals
# (other supported strategies: "mean", or a literal fill label such as "missing").
missingValueImputer = Pipeline([
    ('missing_value_imputer', MissingValueImputer(num_strategy="median", cat_strategy="mode")),
])
rule_100 = "*" * 100
print(rule_100)
print(f"PipeLine | Impute Missing Values: {missingValueImputer}")
print(rule_100)

# Apply the pipeline; the imputer logs fitted fill values and a final NA count.
rule_50 = "*" * 50
print(rule_50)
print("--- Applying Pipeline | Impute Missing Values ---")
print(rule_50)
df = missingValueImputer.fit_transform(df)
****************************************************************************************************
PipeLine | Impute Missing Values: Pipeline(steps=[('missing_value_imputer', MissingValueImputer())])
****************************************************************************************************
**************************************************
--- Applying Pipeline | Impute Missing Values ---
**************************************************
MissingValueImputer: Fitted. Numerical imputers: {'composite_score': None, 'Marriages_03': 1.0, 'Num_ADL_03': 0.0, 'Num_IADL_03': 0.0, 'Num_CES-D_Symptoms_03': 3.0, 'Num_Illnesses_03': 1.0, 'Marriages_12': 1.0, 'Num_ADL_12': 0.0, 'Num_IADL_12': 0.0, 'Num_CES-D_Symptoms_12': 3.0, 'Num_Illnesses_12': 1.0, 'JobHrsWeekly_03': 45.0, 'Earnings_03': 0.0, 'SpouseEarnings_03': 0.0, 'hincome_03': 30000.0, 'hinc_business_03': 0.0, 'hinc_rent_03': 0.0, 'hinc_assets_03': 0.0, 'hinc_cap_03': 0.0, 'Pension_03': 0.0, 'SpousePension_03': 0.0, 'JobHrsWeekly_12': 36.0, 'Earnings_12': 0.0, 'SpouseEarnings_12': 0.0, 'hincome_12': 20000.0, 'hinc_business_12': 0.0, 'hinc_rent_12': 0.0, 'hinc_assets_12': 0.0, 'hinc_cap_12': 0.0, 'Pension_12': 0.0, 'SpousePension_12': 0.0, 'PredictionYear': None}, Categorical imputers: {'Year': None, 'Age_03': '50-59', 'Urban_03': None, 'Married_03': None, 'Education_03': '1-5 Years', 'Num_Living_Child_03': '3 or 4', 'Migration_03': 0.0, 'GlobalHealth_03': 'Fair', 'ADL_Dress_03': 0.0, 'ADL_Walk_03': 0.0, 'ADL_Bath_03': 0.0, 'ADL_Eat_03': 0.0, 'ADL_Bed_03': 0.0, 'ADL_Toilet_03': 0.0, 'IADL_Money_03': 0.0, 'IADL_Meds_03': 0.0, 'IADL_Shop_03': 0.0, 'IADL_Meals_03': 0.0, 'Depressed_03': 0.0, 'Hard_03': 0.0, 'Restless_03': 0.0, 'Lonely_03': 0.0, 'Enjoy_03': 1.0, 'Sad_03': 0.0, 'Energetic_03': 0.0, 'CES-D_Symptoms_03': 0.0, 'Hypertension_03': None, 'Diabetes_03': None, 'Respiratory_Illness_03': None, 'Arthritis_03': None, 'HeartAttack_03': 0.0, 'Stroke_03': None, 'Cancer_03': None, 'BMI_03': 'Over Weight', 'Exercise_03': 0.0, 'Alcohol_03': 0.0, 'Tobacco_03': 0.0, 'Test_Cholestrol_03': 1.0, 'Test_Tuber_03': 0.0, 'Test_Diabetes_03': 1.0, 'Test_BloodPress_03': 1.0, 'Hospitalized_03': None, 'Visit_Dr_03': 1.0, 'OutPatient_03': None, 'Visit_Dental_03': None, 'Insured_03': None, 'FamilyDecisions_03': 'Both', 'Employment_03': 'Currently Working', 'Age_12': '60-69', 'Urban_12': 'Urban', 'Married_12': 'Married or In Civil Union', 'Education_12': '1-5 Years', 
'Num_Living_Child_12': '3 or 4', 'Migration_12': 0.0, 'GlobalHealth_12': 'Fair', 'ADL_Dress_12': 0.0, 'ADL_Walk_12': 0.0, 'ADL_Bath_12': 0.0, 'ADL_Eat_12': 0.0, 'ADL_Bed_12': 0.0, 'ADL_Toilet_12': 0.0, 'IADL_Money_12': 0.0, 'IADL_Meds_12': 0.0, 'IADL_Shop_12': 0.0, 'IADL_Meals_12': 0.0, 'Depressed_12': 0.0, 'Hard_12': 0.0, 'Restless_12': 0.0, 'Lonely_12': 0.0, 'Enjoy_12': 1.0, 'Sad_12': 0.0, 'Energetic_12': 0.0, 'CES-D_Symptoms_12': 0.0, 'Hypertension_12': 0.0, 'Diabetes_12': 0.0, 'Respiratory_Illness_12': 0.0, 'Arthritis_12': 0.0, 'HeartAttack_12': 0.0, 'Stroke_12': 0.0, 'Cancer_12': 0.0, 'BMI_12': 'Over Weight', 'Exercise_12': 0.0, 'Alcohol_12': 0.0, 'Tobacco_12': 0.0, 'Test_Cholestrol_12': 1.0, 'Test_Tuber_12': 0.0, 'Test_Diabetes_12': 1.0, 'Test_BloodPress_12': 1.0, 'Hospitalized_12': 0.0, 'Visit_Dr_12': 1.0, 'OutPatient_12': 0.0, 'Visit_Dental_12': 0.0, 'Insured_12': 1.0, 'FamilyDecisions_12': 'Both', 'Employment_12': 'Dedicated to Household Chores', 'Vax_Flu_12': 1.0, 'Vax_Pneu_12': 0.0, 'CareAdult_12': 0.0, 'CareChild_12': 0.0, 'Volunteer_12': 0.0, 'AttendsClass_12': 0.0, 'AttendsClub_12': 0.0, 'Reads_12': 1.0, 'Games_12': 0.0, 'TableGames_12': 0.0, 'UseElectronicDevices_12': 1.0, 'HouseMaintenance_12': 1.0, 'TV_12': 1.0, 'Sewing_12': 0.0, 'Satement_Ideal_12': 'Agrees', 'Satement_Excel_12': 'Agrees', 'Satement_Fine_12': 'Agrees', 'COSAS_IMP_12': 'Agrees', 'WouldntChange_12': 'Agrees', 'Memory_12': 'Fair', 'Gender': None, 'EducationMother': 'Some Primary', 'EducationFather': 'Some Primary', 'SpouseGender_03': 'Man', 'Religon_Imp_03': 'Very Important', 'SpouseGender_12': 'Man', 'JobCatLongest_12': 'Artisans and Workers in Production, Repair, Maintenance', 'Religon_Imp_12': 'Very Important', 'Meet_FnF_12': 'Never', 'SocialActivities_12': 'Never', 'AttendReligiousServices_12': 0.0, 'SpeaksEnglish_12': 0.0, 'HousingEnvironment_12': 'Wood, Mosaic, or other Covering'}
MissingValueImputer: Number of missing values after imputation: 0

Save to .CSV¶

In [31]:
# print(f"Path to Save Cleaned File: {path_train_cleaned}")
In [32]:
# df.to_csv(path_train_cleaned, index=False, encoding="utf8")
# print("Done")